Anumeha Mishra - CS 544 Term Project December 7, 2020

Introduction

This dataset has been taken from kaggle and the link is as below: https://www.kaggle.com/sakshigoyal7/credit-card-customers

This dataset contains personal information of the customers such as their age, marital status, income,etc.

This dataset contains 10,128 rows and 20 columns.

The columns attributes are :

CLIENTNUM, Attrition flag,Customer_Age , Gender , Dependent_count , Education_Level, Marital_Status, Income_Category, Card_Category, Months_on_book, Total_Relationship_Count, Months_Inactive, Contacts_Count,Credit_Limit,Total_Revolving_Bal, Total_Amt_Chng_Q4_Q1,Total_Trans_Amt,Total_Trans_Ct,Total_Ct_Chng_Q4_Q1,Avg_Utilization_Ratio

Objective

The objective of this project is to gain an insight on customers banking and show visualizations on different trends based on their age, gender and marital status.

Preparing the data

You should consider writing a function whenever you’ve copied and pasted a block of code more than twice (i.e. you now have three copies of the same code). For example, take a look at this code. What does it do?

data <- as.data.frame(read.csv("/Users/anumehamishra/Desktop/BankChurners.csv", header=T, sep=",", na.strings = c("","NA")))
head(data)
##   CLIENTNUM    Attrition_Flag Customer_Age Gender Dependent_count
## 1 768805383 Existing Customer           45      M               3
## 2 818770008 Existing Customer           49      F               5
## 3 713982108 Existing Customer           51      M               3
## 4 769911858 Existing Customer           40      F               4
## 5 709106358 Existing Customer           40      M               3
## 6 713061558 Existing Customer           44      M               2
##   Education_Level Marital_Status Income_Category Card_Category Months_on_book
## 1     High School        Married     $60K - $80K          Blue             39
## 2        Graduate         Single  Less than $40K          Blue             44
## 3        Graduate        Married    $80K - $120K          Blue             36
## 4     High School        Unknown  Less than $40K          Blue             34
## 5      Uneducated        Married     $60K - $80K          Blue             21
## 6        Graduate        Married     $40K - $60K          Blue             36
##   Total_Relationship_Count Months_Inactive Contacts_Count Credit_Limit
## 1                        5               1              3        12691
## 2                        6               1              2         8256
## 3                        4               1              0         3418
## 4                        3               4              1         3313
## 5                        5               1              0         4716
## 6                        3               1              2         4010
##   Total_Revolving_Bal Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct
## 1                 777                1.335            1144             42
## 2                 864                1.541            1291             33
## 3                   0                2.594            1887             20
## 4                2517                1.405            1171             20
## 5                   0                2.175             816             28
## 6                1247                1.376            1088             24
##   Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 1               1.625                 0.061
## 2               3.714                 0.105
## 3               2.333                 0.000
## 4               2.333                 0.760
## 5               2.500                 0.000
## 6               0.846                 0.311

Analyzing the data

PART 1
Categorical 1- Finding the number of males and females customers
table(data$Gender)
## 
##    F    M 
## 5358 4769
barplot(table(data$Gender), col = "cyan", ylim = c(0,6000), las = 2, xlab = "Gender",ylab = "Number of customers")

Finding the martial status distributions of the customers
table(data$Marital_Status)
## 
## Divorced  Married   Single  Unknown 
##      748     4687     3943      749
barplot(table(data$Marital_Status), col = "cyan", ylim = c(0,5000), las = 2, xlab = "Marital_Status",ylab = "Number of customers")

Numerical 1- Finding the number of males and females customers who have dependents as = 5
Dependents <- data[data$Dependent_count == '5', ]
table(Dependents$Gender)
## 
##   F   M 
## 225 199
barplot(table(Dependents$Gender), col = "red", ylim = c(0,250), density=c(10,20),las = 2, xlab = "Dependents = 5", ylab = "Number of customers")

PART 2
Categorical 2- Calculating distribution of education level among customers
Education_Level <- table(data$Education_Level)
slice.labels <- names(Education_Level)
slice.percents <- round(Education_Level/sum(Education_Level)*100)
slice.labels <- paste(slice.labels, slice.percents)
slice.labels <- paste(slice.labels, "%", sep = "")
pie(Education_Level, labels = slice.labels, col = hcl(c(0, 60, 120)))

PART 3
Numerical - numeric data and analyzing distribution on it
a. Finding the summary of my data on the basis of Age
Boxplot showing the ages of customers
fivenum(data$Customer_Age)
## [1] 26 41 46 52 73
boxplot(data$Customer_Age, horizontal = TRUE, xaxt = "n", xlab = "Age of customers", col=hcl(1))
axis(side = 1, at=fivenum(data$Customer_Age), labels = TRUE)

b. This will show the distribution of status of customers based on thier age.
age_status <- fivenum(data$Customer_Age)
age_status
## [1] 26 41 46 52 73
library(ggplot2)
ggplot(data, aes(x=Customer_Age)) +
  geom_histogram(color="black", fill="blue") + facet_grid(~Attrition_Flag)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

c- The distriution of age varying with 5 dependents
Dependents <- data[data$Dependent_count == '5',]
head(Dependents)
##     CLIENTNUM    Attrition_Flag Customer_Age Gender Dependent_count
## 2   818770008 Existing Customer           49      F               5
## 11  708790833 Existing Customer           42      M               5
## 59  711427458 Existing Customer           44      F               5
## 74  820582308 Existing Customer           42      M               5
## 141 789322833 Attrited Customer           48      F               5
## 156 713786508 Existing Customer           42      F               5
##     Education_Level Marital_Status Income_Category Card_Category Months_on_book
## 2          Graduate         Single  Less than $40K          Blue             44
## 11       Uneducated        Unknown         $120K +          Blue             31
## 59         Graduate        Married         Unknown          Blue             35
## 74       Uneducated        Married    $80K - $120K          Blue             37
## 141     High School        Married  Less than $40K          Blue             38
## 156         Unknown        Married     $40K - $60K          Blue             36
##     Total_Relationship_Count Months_Inactive Contacts_Count Credit_Limit
## 2                          6               1              2         8256
## 11                         5               3              2         6748
## 59                         4               1              2         6273
## 74                         6               2              2        22913
## 141                        1               3              3         8025
## 156                        3               3              3         2038
##     Total_Revolving_Bal Total_Amt_Chng_Q4_Q1 Total_Trans_Amt Total_Trans_Ct
## 2                   864                1.541            1291             33
## 11                 1467                0.831            1201             42
## 59                  978                2.275            1359             25
## 74                 1528                0.414            1394             35
## 141                   0                0.654             673             18
## 156                   0                0.786            1238             28
##     Total_Ct_Chng_Q4_Q1 Avg_Utilization_Ratio
## 2                 3.714                 0.105
## 11                0.680                 0.217
## 59                1.083                 0.156
## 74                0.522                 0.067
## 141               0.800                 0.000
## 156               0.750                 0.000
A <- Dependents$Customer_Age
B <- Dependents$Customer_Age
boxplot(A,B, xaxt = "n", xlab = "variations of age with 5 dependents", ylab = "Age of Customers", col=c("yellow", "green"))

d- Scatter Plot- Showing age of the customers, count of Dependents and the count of transactions
data2=data.frame(Age=data$Customer_Age,Dependent=data$Dependent_count,Transactions=data$Total_Trans_Ct)
head(data2,10)
##    Age Dependent Transactions
## 1   45         3           42
## 2   49         5           33
## 3   51         3           20
## 4   40         4           20
## 5   40         3           28
## 6   44         2           24
## 7   51         4           31
## 8   32         0           36
## 9   37         3           24
## 10  48         2           32
plot(data2 , pch=20 , cex=1.0 , col="lightblue")

e- Plotted a ggplot for age vs Dependent_count, and we have the points colored based on Education_Level
ggplot(data = data) +
  geom_point(mapping = aes(x = Customer_Age, y = Dependent_count, colour = Education_Level ))

library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
f - Mixed plot of age, months on book in the bank and contacts made by each customer
a <- plot_ly(data, x = data$Customer_Age, type="box", name = 'age')
b <-add_trace(a, x = data$Months_on_book, type="box", name = 'Months')
c <-add_trace(b , x = data$Contacts_Count,type = "box" , name ="Contacts_made" )
c
## Warning: `arrange_()` is deprecated as of dplyr 0.7.0.
## Please use `arrange()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
g - Analysis of distribution on Customers Age
values <- data$Customer_Age
tab <- table(values)
dframe <- as.data.frame(tab)
dframe
##    values Freq
## 1      26   78
## 2      27   32
## 3      28   29
## 4      29   56
## 5      30   70
## 6      31   91
## 7      32  106
## 8      33  127
## 9      34  146
## 10     35  184
## 11     36  221
## 12     37  260
## 13     38  303
## 14     39  333
## 15     40  361
## 16     41  379
## 17     42  426
## 18     43  473
## 19     44  500
## 20     45  486
## 21     46  490
## 22     47  479
## 23     48  472
## 24     49  495
## 25     50  452
## 26     51  398
## 27     52  376
## 28     53  387
## 29     54  307
## 30     55  279
## 31     56  262
## 32     57  223
## 33     58  157
## 34     59  157
## 35     60  127
## 36     61   93
## 37     62   93
## 38     63   65
## 39     64   43
## 40     65  101
## 41     66    2
## 42     67    4
## 43     68    2
## 44     70    1
## 45     73    1
x <- as.numeric(as.character(dframe$values))
x
##  [1] 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [26] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 70 73

The probability distribution is:

f <- dframe$Freq / (sum(dframe$Freq))

The mean:

mu <- sum(x * f)
mu
## [1] 46.32596

Variance of the distribution is:

sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 64.26296
sigma <- sqrt(sigmaSquare)
sigma
## [1] 8.016418

Showing probability distribution of age:

plot(x, f, type = 'h', xlab = "Age", ylab = "PMF", ylim = c(0, 0.06), main
     = "Spike plot for Age")
abline(h = 0 )

Showing cumulative distribution of age:

cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for Age", xlab = "Age", ylab = "C
DF")

h - Analysis on the distribution of Customer’s income
values2 <- data$Dependent_count
tab <- table(values2)
dframe <- as.data.frame(tab)
dframe
##   values2 Freq
## 1       0  904
## 2       1 1838
## 3       2 2655
## 4       3 2732
## 5       4 1574
## 6       5  424
x <- as.numeric(as.character(dframe$values2))
x
## [1] 0 1 2 3 4 5

The probability distribution is:

f <- dframe$Freq / (sum(dframe$Freq))

The mean:

mu <- sum(x * f)
mu
## [1] 2.346203

The variance of the distribution is:

sigmaSquare <- sum((x - mu)^2 * f)
sigmaSquare
## [1] 1.686996
sigma <- sqrt(sigmaSquare)
sigma
## [1] 1.298844

Showing probability distribution of Income category:

plot(x, f, type = 'h', xlab = "Income_Category", ylab = "PMF", ylim = c(0, 0.5), main
     = "Spike plot for Income")
abline(h = 0 )

Showing cumulative distribution of Income category

cdf <- c(0, cumsum(f))
cdfplot <- stepfun(x, cdf)
plot(cdfplot, verticals=FALSE, pch=16, main="CDF Plot for Income", xlab = "Age", ylab = "C
DF")

PART 4- Central Limit Theorem
Applying Central Limit Theorem on age
Age <-data$Customer_Age
ctable <- table(Age)

The mean is :

mu <- mean(Age)
mu
## [1] 46.32596
sigma <- sd(Age)
sigma
## [1] 8.016814

Histogram showing the age of the customers:

dframe <- as.data.frame(ctable)
x <- as.numeric(as.character(data$Customer_Age))
head(x)
## [1] 45 49 51 40 40 44
hist(x, probability = TRUE, xlim = c(0, 80), xlab = "Age", ylab = "Density", main = "Histogram of age")

Showing results with sample size 5
samples <- 1000
sample_size <- 5
xbar <- numeric(samples)
for(i in 1:samples){
  xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
     main = "Densities of age with sample size 5", col = "purple")

Mean:

mean1 <- mean(xbar)
mean1
## [1] 46.3198

Standard deviation

sd1 <- sd(xbar)
sd1
## [1] 3.640844
Showing results with sample size 20
samples <- 1000
sample_size <- 20
xbar <- numeric(samples)
for(i in 1:samples){
  xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
     main = "Densities of age with sample size 20", col = "red")

Mean:

mean2 <- mean(xbar)
mean2
## [1] 46.2087

Standard deviation

sd2 <- sd(xbar)
sd2
## [1] 1.781045
Showing results with sample size 50
samples <- 1000
sample_size <- 50
xbar <- numeric(samples)
for(i in 1:samples){
  xbar[i] = mean(sample(x, size = sample_size, replace = T))
}
hist(xbar, prob = T, xlab = "Age",
     main = "Densities of age with sample size 50", col = "brown")

Mean:

mean3 <- mean(xbar)
mean3
## [1] 46.31612

Standard deviation

sd3 <- sd(xbar)
sd3
## [1] 1.104157
cat("1st distribution:\nMean =",mean1,"\nSD =",sd1)
## 1st distribution:
## Mean = 46.3198 
## SD = 3.640844
cat("2nd distribution:\nMean =",mean2,"\nSD =",sd2)
## 2nd distribution:
## Mean = 46.2087 
## SD = 1.781045
cat("3rd distribution:\nMean =",mean3,"\nSD =",sd3)
## 3rd distribution:
## Mean = 46.31612 
## SD = 1.104157
INFERENCE: As we change our sample space and keep increasing it, we notice that the mean almost remains the same whereas the Standard deviation decreases hence proving the property of Central Limit Theorem
PART 5- Sampling
library(sampling)
table(data$Customer_Age)
## 
##  26  27  28  29  30  31  32  33  34  35  36  37  38  39  40  41  42  43  44  45 
##  78  32  29  56  70  91 106 127 146 184 221 260 303 333 361 379 426 473 500 486 
##  46  47  48  49  50  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65 
## 490 479 472 495 452 398 376 387 307 279 262 223 157 157 127  93  93  65  43 101 
##  66  67  68  70  73 
##   2   4   2   1   1
hist(data$Customer_Age,col="yellow")

Using sampling methods for sample size of 200
Without replacement
sample.size <- 200
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$Customer_Age)
set.seed(153)
With replacement
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$Customer_Age)
set.seed(123)
Systematic Sampling
N <- nrow(data)
n <- 200
items in each group
k <- ceiling(N / n)
k
## [1] 51
random item from first group
r <- sample(k, 1)
r
## [1] 31
selecting every kth item
s <- seq(r, by = k, length = n)
head(s)
## [1]  31  82 133 184 235 286
sample.3 <- data[s, ]
table(sample.3$Customer_Age)
## 
## 27 28 29 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 
##  1  1  2  1  4  4  3  4  7  8  8  2 11 12  8 12  8  7  5 17  8  6  4  9  5  3 
## 55 56 57 58 59 60 61 62 64 65 
##  4 10  6  3  2  2  5  2  1  3
mean_systematic <- mean(sample.3$Customer_Age)
mean_systematic
## [1] NA
Systematic sampling with unequal probabilities
pik <- inclusionprobabilities(data$Customer_Age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$Customer_Age)
## 
## 26 27 28 30 31 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 
##  1  1  2  4  2  2  2  3  5  3  3  5  5  7  4 16 10 10  7 10 10  9  8  7 12  8 
## 54 55 56 57 58 59 60 62 64 65 67 
##  9  4  6 10  3  3  2  2  1  3  1
Stratified Sampling
data["age_range"] = NA
data$age_range <- cut(data$Customer_Age, breaks = c(0, 25, 50, 75, Inf), labels = c('A', 'B', 'C','D'))
data_age <- data.frame(
  Customer_Age = data$Customer_Age,
  age_range = data$age_range
)
freq <- table(data_age$age_range)
freq
## 
##    A    B    C    D 
##    0 7049 3078    0
set.seed(123)
head(data_age)
##   Customer_Age age_range
## 1           45         B
## 2           49         B
## 3           51         C
## 4           40         B
## 5           40         B
## 6           44         B
Cluster sampling
cl <- cluster(data, c("Customer_Age"), size = 4, method = "srswor")
sample.6 <- getdata(data, cl)
table(sample.6$Customer_Age)
## 
##  28  39  40  56 
##  29 333 361 262
mean_cluster <- mean(sample.6$Customer_Age)
mean_cluster
## [1] 43.56447
Mean without replacement
mean_srswor
## [1] 45.82
Mean with replacement
mean_srswr
## [1] 46.0603
Systematic Mean
mean_systematic
## [1] NA
Using sampling methods for sample size of 700
Without replacement
sample.size <- 700
s <- srswor(sample.size,nrow(data))
sample.1 <- data[s != 0, ]
mean_srswor <- mean(sample.1$Customer_Age)
With replacement
set.seed(153)
s <- srswr(sample.size, nrow(data))
sample.2 <- data[s != 0, ]
mean_srswr <- mean(sample.2$Customer_Age)
Systematic Sampling
N <- nrow(data)
n <- 700
items in each group
k <- ceiling(N / n)
k
## [1] 15
random items in first group
r <- sample(k, 1)
r
## [1] 10
selecting every kth item
s <- seq(r, by = k, length = n)
head(s)
## [1] 10 25 40 55 70 85
sample.3 <- data[s, ]
table(sample.3$Customer_Age)
## 
## 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 
##  5  2  3  1  7  3  7  4 15  6 10 20 21 23 21 33 29 21 38 40 36 27 38 32 26 23 
## 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 
## 21 27 22 12 18 18 18 11 11  7  5  4  2  7  1
mean_systematic <- mean(sample.3$Customer_Age)
Systematic Sampling with unequal probabilities
pik <- inclusionprobabilities(data$Customer_Age, sample.size)
s <- UPsystematic(pik)
sample.4 <- data[s != 0, ]
table(sample.4$Customer_Age)
## 
## 26 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 
##  2  1  5  5  3  5  9  8 16 12 19 16 18 17 20 23 34 27 42 26 36 36 42 39 35 25 
## 53 54 55 56 57 58 59 60 61 62 63 64 65 67 
## 20 23 29 26 14  8 14 10 12  9  2  3  8  1
Stratified sampling
freq <- table(data_age$age_range)
freq
## 
##    A    B    C    D 
##    0 7049 3078    0
set.seed(123)
head(data_age)
##   Customer_Age age_range
## 1           45         B
## 2           49         B
## 3           51         C
## 4           40         B
## 5           40         B
## 6           44         B
Cluster mean
mean_cluster
## [1] 43.56447
Systematic mean
mean_systematic
## [1] NA
Mean With replacement
mean_srswr
## [1] 46.47563
Mean Without replacement
mean_srswor
## [1] 46.24